# -*- coding: utf-8 -*-
import scrapy
import datetime
from scrapy.loader import ItemLoader
from test1.items import xypjweb
from scrapyluke.processors import *
import re
import scrapyd

class spider(scrapy.Spider):
    """Crawl company credit-evaluation records from xypjweb.mofcom.gov.cn.

    Flow: ``parse`` requests the first listing page, ``parse_go`` reads the
    pager text and schedules every listing page, ``parse_next`` follows the
    detail links found on a listing page, and ``parse_info`` turns one detail
    page into an ``xypjweb`` item.
    """

    name = 'xypj'
    start_urls = ['http://xypjweb.mofcom.gov.cn/IdentifyingCode']

    # Site root that the hrefs taken from listing pages are appended to.
    _BASE_URL = 'http://xypjweb.mofcom.gov.cn'
    # Listing page URL; the page index is appended as the ``sp`` value.
    _LIST_URL = (
        _BASE_URL
        + '/pages/xypt/HypjCorpInfoLoginList_nav.pageNoLink.html?session=T&sp='
    )

    # Text node of the pager on a listing page.
    _PAGER_XPATH = (
        "//body[@id='Body']/div[@class='blank']/form[@id='listData']"
        "/section[@class='main clearfix mt130']/article[@class='listCon']"
        "/div[@class='page']/div/div[@id='Any']/span[@id='Insert_50']/text()"
    )

    # Loosely anchored containers used for the first group of fields.
    _CERT_BOX = "//div[@class='lrBox clearfix']"
    _CERT_SECTION = "//section[@class='infoCon']"

    # Fully anchored containers used for the second group of fields.
    _CORP_SECTION = (
        "//body[@id='Body']/div[@class='blank']/form[@id='editForm']"
        "/section[@class='main clearfix mt130']/article[@class='infoMain']"
        "/article[@class='info_01 mt15']/section[@class='infoCon']"
    )
    _CORP_BOX = _CORP_SECTION + "/div[@class='lrBox clearfix']"

    # This one query is anchored at the ``blank`` div instead of at ``body``.
    _REG_NO_XPATH = (
        "//div[@class='blank']/form[@id='editForm']"
        "/section[@class='main clearfix mt130']/article[@class='infoMain']"
        "/article[@class='info_01 mt15']/section[@class='infoCon']"
        "/div[@class='lrBox clearfix']/p[@class='p2'][1]/text()"
    )

    # Item fields read from consecutive <p class="p2"> positions, starting at
    # position 3, inside _CERT_BOX.
    _CERT_P2_FIELDS = (
        'date_of_issue',
        'date_of_issue_en',
        'valid_until',
        'valid_until_en',
        'the_issuing_unit__association_',
        'the_issuing_unit__association_en',
    )
    # Item fields read from <p class="p1"> positions 1.. inside _CERT_SECTION.
    _CERT_P1_FIELDS = (
        'the_evaluation_unit__credit_institution_or_association_',
        'the_evaluation_unit__credit_institution_or_association_en',
    )
    # Item fields read from consecutive <p class="p2"> positions, starting at
    # position 3, inside _CORP_BOX.
    _CORP_P2_FIELDS = (
        'organization_code',
        'organization_code_en',
        'the_legal_representative',
        'the_legal_representative_en',
        'registered_capital',
        'registered_capital_en',
        'industry',
        'industry_en',
        'website',
        'website_en',
        'area',
        'area_en',
        'zip_code',
        'zip_code_en',
    )
    # Item fields read from <p class="p1"> positions 1.. inside _CORP_SECTION.
    _CORP_P1_FIELDS = (
        'operating_address',
        'operating_address_en',
        'the_main_business',
        'the_main_business_en',
    )

    def parse(self, response):
        """Ignore the start page's content and request listing page 0."""
        yield scrapy.Request(self._LIST_URL + '0', callback=self.parse_go)

    def parse_go(self, response):
        """Read the pager text and schedule one request per listing page.

        The fourth integer found in the pager text (after removing thousands
        separators) is used as the page count.
        NOTE(review): the pager's exact wording is not visible here; confirm
        that the fourth number really is the total number of pages.
        """
        pager_text = response.xpath(self._PAGER_XPATH).extract_first()
        if pager_text is None:
            # Missing pager (layout change, error page): nothing to schedule.
            self.logger.warning('Pager text not found on %s', response.url)
            return

        numbers = re.findall(r'\d+', pager_text.replace(',', ''))
        if len(numbers) < 4:
            self.logger.warning(
                'Unexpected pager text %r on %s', pager_text, response.url)
            return

        for page_index in range(int(numbers[3])):
            # dont_filter: page 0 was already requested by parse(), so the
            # duplicate filter would otherwise drop it.
            yield scrapy.Request(
                self._LIST_URL + str(page_index),
                callback=self.parse_next,
                dont_filter=True,
            )

    def parse_next(self, response):
        """Follow the detail-page links present on one listing page.

        Links are looked up by anchor id: ``ExternalLink`` followed by
        ``ExternalLink_0`` .. ``ExternalLink_8``. Ids that are absent from
        the page (for example on a short last page) are skipped instead of
        aborting the whole page.
        """
        link_ids = ['ExternalLink']
        link_ids.extend('ExternalLink_' + str(n) for n in range(0, 9))

        for link_id in link_ids:
            href = response.xpath(
                '//a[@id = "' + link_id + '"]/@href').extract_first()
            if href is None:
                continue
            yield scrapy.Request(self._BASE_URL + href, callback=self.parse_info)

    @staticmethod
    def _first_text(response, container, css_class, position):
        """Return the first text node of the ``position``-th ``<p>`` with class
        ``css_class`` directly under ``container``, or None when absent."""
        query = "%s/p[@class='%s'][%d]/text()" % (container, css_class, position)
        return response.xpath(query).extract_first()

    def parse_info(self, response):
        """Build an ``xypjweb`` item from one company detail page.

        Every field is the first matching text node of its XPath, or None
        when the node is missing. ``company_name`` is additionally stripped
        of surrounding whitespace.
        """
        cominfo = xypjweb()
        cominfo['url'] = response.url
        cominfo['insert_time'] = str(datetime.datetime.now())

        company_name = response.xpath(
            "//header[@class='title_02']/text()").extract_first()
        # Guard against a missing header so the rest of the item still gets
        # populated instead of the callback raising AttributeError.
        cominfo['company_name'] = (
            company_name.strip() if company_name is not None else None
        )

        # Grade and certificate number have no separate English node; the
        # same value is stored under both field names.
        grade = self._first_text(response, self._CERT_BOX, 'p3', 1)
        cominfo['evaluation_grade'] = cominfo['evaluation_grade_en'] = grade
        certificate = self._first_text(response, self._CERT_BOX, 'p2', 1)
        cominfo['certificate_number'] = cominfo['certificate_number_en'] = certificate

        for position, field in enumerate(self._CERT_P2_FIELDS, 3):
            cominfo[field] = self._first_text(
                response, self._CERT_BOX, 'p2', position)
        for position, field in enumerate(self._CERT_P1_FIELDS, 1):
            cominfo[field] = self._first_text(
                response, self._CERT_SECTION, 'p1', position)

        cominfo['business_registration_no'] = response.xpath(
            self._REG_NO_XPATH).extract_first()
        cominfo['business_registration_no_en'] = cominfo['business_registration_no']

        for position, field in enumerate(self._CORP_P2_FIELDS, 3):
            cominfo[field] = self._first_text(
                response, self._CORP_BOX, 'p2', position)
        for position, field in enumerate(self._CORP_P1_FIELDS, 1):
            cominfo[field] = self._first_text(
                response, self._CORP_SECTION, 'p1', position)

        return cominfo
